import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scvelo as scv
import scanpy.external as sce
import os
PROJECT_DIR = "/home/sisaev/projects/Gustafsson_et_al_2022"
DATA_DIR = "/home/sisaev/data/thymus_Karin/count"
exec(open(f"{PROJECT_DIR}/tools/tools.py").read())
sc.settings.verbosity = 3
sns.set(font_scale=1)
sc.settings.set_figure_params(dpi=150)
sns.set_style("white")
Reds = get_beautiful_cmap()
sample_type = {
"SCG_MTH2_IR" : "IR",
"ID_MTH5_IR" : "IR",
"SCG_MTH7_IR" : "IR",
"SCG_65" : "control",
"SCG_MTH2_C" : "control",
"SCG_MTH3_C" : "control",
"SCG_MTH4_C" : "control",
"ID_MTH9_Old_M1" : "aged",
"ID_MTH9_Old_M2" : "aged",
"ID_MTH9_Old_F3" : "aged",
"ID_MTH9_Old_F4" : "aged",
"SCG_MTH3_IL7_C" : "Il7ko",
"ID_MTH5_IL7_C" : "Il7ko",
"SCG_MTH8_IL7_C1" : "Il7ko",
"SCG_MTH8_IL7_C2" : "Il7ko"
}
%load_ext rpy2.ipython
The rpy2.ipython extension is already loaded. To reload it, use: %reload_ext rpy2.ipython
%%R
library(ggplot2)
library(DescTools)
library(cowplot)
library(patchwork)
library(conos)
library(cacoa)
adata = sc.read_h5ad(f"{PROJECT_DIR}/data/Mouse_AllCells.h5ad")
os.mkdir(f"{PROJECT_DIR}/data/Mouse_AllCells_pagoda2_conos/")
adata = run_conos_pagoda2(
adata,
batch_key="sample_id",
script_path=f"{PROJECT_DIR}/tools/conos_wrapper.R",
tmp_dir=f"{PROJECT_DIR}/data/Mouse_AllCells_pagoda2_conos/"
)
(1/3) Writing .h5ad-file (2/3) Running wrapper (it may takes a long time) (3/3) Reading conos outputs
sc.tl.leiden(adata, resolution=3)
fig, axes = plt.subplots(ncols=2, dpi=150, figsize=(11, 5))
sc.pl.umap(adata, color=["leiden"], frameon=False, title="Clusters", ax=axes[0], show=False)
sc.pl.umap(adata, color=["condition"], frameon=False, title="Condition", ax=axes[1], show=False)
fig.suptitle("Conos integration")
fig.tight_layout()
running Leiden clustering
finished: found 29 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:20)
sc.pl.umap(
adata,
cmap=Reds,
ncols=4,
frameon=False,
color=[
"Cd3e", "Cd4", "Cd8a", "Ptprc", # HC
"Fabp4", "Cd36", "Egfl7", "Pecam1", # EC
"Gsn", "Dpt", "Dcn", "Col1a2", # MC
"Rgs5", "Ifitm1", "Kcnj8", "Acta2", # PV, aPVx2, pcPV
"Nkain4", "Upk3b", "Lrrn4", "Krt19", # Lrrn4+
"Perp", "Ctsl", "Ubd", "Cd52", # TEC Ax2, TEC Bx2
"Ptpn18", "Lrmp", "Avil", "Fyb", # Tuft cells
"Dct", "Pax3", "Mitf", "Fabp7", # NC
"Acta2", "Cpe", "Tagln", "Myl9" # Smooth muscle cells
]
)
ct_to_leiden = {
"HC" : ["0", "1", "4", "12", "21", "22", "25"],
"MC" : ["3", "5", "6", "7", "9", "10", "11", "14"],
"EC" : ["2", "19"],
"pcPV" : ["8"],
"aPV" : ["16"],
"NC" : ["13"],
"TEC A" : ["15", "18"],
"TEC B" : ["20"],
"Lrrn4+" : ["17"],
"SM" : ["24"],
"Tuft cells" : ["23"],
"Unknown" : ["26", "27", "28"]
}
paper_cmap = {
"EC" : "#71fdea",
"TEC A" : "#1d25f3",
"TEC B" : "#dcae3d",
"pcPV" : "#ae0b09",
"aPV" : "#fda936",
"Tuft cells" : "#a1fb73",
"Lrrn4+" : "#f443a9",
"MC" : "#6b04d4",
"NC" : "#34be1e",
"HC" : "#ff0000",
"SM" : "#00573e",
"Penk+ Cdh11+ MC" : "#00b3f4",
"Cd248+ MC" : "#008af6",
"Postn+ MC" : "#ed00ff"
}
leiden_to_ct = {}
for ct in ct_to_leiden:
for leiden in ct_to_leiden[ct]:
leiden_to_ct[leiden] = ct
adata.obs["cell_type"] = [leiden_to_ct[leiden] for leiden in adata.obs.leiden]
adata = adata[adata.obs.cell_type != "Unknown"]
adata.obs["cell_type"] = adata.obs["cell_type"].astype("category")
adata.uns["cell_type_colors"] = [paper_cmap[cell_type] for cell_type in adata.obs["cell_type"].cat.categories]
sc.pl.umap(adata, color="cell_type", frameon=False, title="Murine thymus")
Trying to set attribute `.obs` of view, copying.
adata_MC = adata[adata.obs.cell_type == "MC"].copy()
adata_MC.X = adata_MC.layers["counts"].copy()
os.mkdir(f"{PROJECT_DIR}/data/Mouse_MC_pagoda2_conos/")
adata = run_conos_pagoda2(
adata,
batch_key="sample_id",
n_hvg=1000,
n_comps=15,
script_path=f"{PROJECT_DIR}/tools/conos_wrapper.R",
tmp_dir=f"{PROJECT_DIR}/data/Mouse_MC_pagoda2_conos/"
)
(1/3) Writing .h5ad-file (2/3) Running wrapper (it may takes a long time) (3/3) Reading conos outputs
sc.tl.leiden(adata_MC, resolution=0.4)
fig, axes = plt.subplots(ncols=2, dpi=150, figsize=(11, 5))
sc.pl.umap(adata_MC, color=["leiden"], frameon=False, title="Clusters", ax=axes[0], show=False)
sc.pl.umap(adata_MC, color=["condition"], frameon=False, title="Condition", ax=axes[1], show=False)
fig.suptitle("Conos integration (number HVG = 1000)")
fig.tight_layout()
running Leiden clustering
finished: found 3 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:03)
scv.pl.umap(
adata_MC,
color=[
"Cd248", "Cdh11", "Penk", "Postn",
"Ccl19", "Flt3l", "Il15"
],
cmap=Reds,
ncols=4
)
MC_annotation = {
"0" : "Penk+ Cdh11+ MC",
"1" : "Cd248+ MC",
"2" : "Postn+ MC"
}
adata_MC.obs["cell_type_l2"] = [MC_annotation[leiden] for leiden in adata_MC.obs.leiden]
adata_MC.obs["cell_type_l2"] = adata_MC.obs["cell_type_l2"].astype("category")
adata_MC.uns["cell_type_l2_colors"] = [paper_cmap[cell_type] for cell_type in adata_MC.obs["cell_type_l2"].cat.categories]
fig, axes = plt.subplots(ncols=2, dpi=150, figsize=(9, 4))
sc.pl.umap(adata_MC, color=["cell_type_l2"], title=["All samples"], frameon=False, ax=axes[0], show=False, legend_loc=None)
sc.pl.umap(adata_MC[adata_MC.obs.condition == "control"], color=["cell_type_l2"], title=["Only control samples"], frameon=False, ax=axes[1], show=False)
MC_expression_control_ccl19 = pd.melt(get_mean_expression(adata_MC[adata_MC.obs.condition == "control"], gene="Ccl19", groupby="cell_type_l2", splitby="sample_id"))
MC_expression_control_flt3l = pd.melt(get_mean_expression(adata_MC[adata_MC.obs.condition == "control"], gene="Flt3l", groupby="cell_type_l2", splitby="sample_id"))
MC_expression_control_il15 = pd.melt(get_mean_expression(adata_MC[adata_MC.obs.condition == "control"], gene="Il15", groupby="cell_type_l2", splitby="sample_id"))
adata.obs["cell_type_l2"] = adata.obs["cell_type"].copy()
cell_type_l2 = []
for MC_barcode in adata.obs.index:
if MC_barcode in adata_MC.obs.index:
cell_type_l2.append(adata_MC.obs.loc[MC_barcode]["cell_type_l2"])
else:
cell_type_l2.append(adata.obs.loc[MC_barcode]["cell_type"])
adata.obs["cell_type_l2"] = cell_type_l2
adata.obs["cell_type_l2"] = adata.obs["cell_type_l2"].astype("category")
adata.uns["cell_type_l2_colors"] = [paper_cmap[cell_type] for cell_type in adata.obs["cell_type_l2"].cat.categories]
del cell_type_l2
sc.pl.umap(adata, color="cell_type_l2", frameon=False, title="Murine thymus")
adata_SC = adata.copy(); adata_SC = adata_SC[adata_SC.obs.cell_type != "HC"]
adata_SC.X = adata_SC.layers["counts"].copy()
os.mkdir(f"{PROJECT_DIR}/data/Mouse_SC_pagoda2_conos/")
adata = run_conos_pagoda2(
adata_SC,
batch_key="sample_id",
script_path=f"{PROJECT_DIR}/tools/conos_wrapper.R",
tmp_dir=f"{PROJECT_DIR}/data/Mouse_SC_pagoda2_conos/"
)
(1/3) Writing .h5ad-file (2/3) Running wrapper (it may takes a long time) (3/3) Reading conos outputs
adata_SC.uns["cell_type_colors"] = [paper_cmap[cell_type] for cell_type in adata_SC.obs["cell_type"].cat.categories]
adata_SC.uns["cell_type_l2_colors"] = [paper_cmap[cell_type] for cell_type in adata_SC.obs["cell_type_l2"].cat.categories]
fig, axes = plt.subplots(ncols=2, dpi=150, figsize=(11, 5))
sc.pl.umap(adata_SC, color=["cell_type_l2"], frameon=False, title="Clusters", ax=axes[0], show=False)
sc.pl.umap(adata_SC, color=["condition"], frameon=False, title="Condition", ax=axes[1], show=False)
fig.suptitle("Murine stromal cells")
fig.tight_layout()
adata_SC.layers["z-score"] = adata_SC.X.copy()
sc.pp.scale(adata_SC, layer="z-score")
adata_SC.obs.cell_type = adata_SC.obs.cell_type.cat.reorder_categories([
"EC", "MC", "aPV", "pcPV", "SM", "TEC A", "TEC B", "Tuft cells", "Lrrn4+", "NC"
])
markers = {
"EC" : ["Pecam1"],
"MC" : ["Prrx1"],
"aPV" : ["Kcnj8", "Rgs5"],
"pcPV" : ["Acta2"],
"MS" : ["Tagln", "Sost"],
"TEC A" : ["Ccl21a", "Ccl25"],
"TEC B" : ["Ubd"],
"Tuft cells" : ["Trpm5"],
"Lrrn4+" : ["Lrrn4"],
"NC" : ["Dct", "Mitf"],
}
sns.set_style("ticks")
sc.pl.dotplot(
adata_SC,
var_names=sum(markers.values(), []),
groupby="cell_type",
cmap="coolwarm",
layer="z-score",
vmin=-2,
vmax=2,
colorbar_title="Mean Z-score",
title="Murine stroma markers"
)
annotation_df = adata.obs.copy()
stromal_annotation_df = adata_SC.obs.copy()
MC_annotation_df = adata_MC.obs.copy()
%%R -i annotation_df
all.con <- readRDS("/home/sisaev/projects/Gustafsson_et_al_2022/data/Mouse_AllCells_pagoda2_conos/conos.rds")
sample.groups <- setNames(
c("Control", "Control", "Control", "Control",
"Aged", "Aged", "Aged", "Aged",
"Transplantation", "Transplantation", "Transplantation",
"IL7_KO", "IL7_KO", "IL7_KO", "IL7_KO"),
c("SCG_65", "SCG_MTH2_C", "SCG_MTH3_C", "SCG_MTH4_C",
"ID_MTH9_Old_M1", "ID_MTH9_Old_F3", "ID_MTH9_Old_M2", "ID_MTH9_Old_F4",
"SCG_MTH2_IR", "ID_MTH5_IR", "SCG_MTH7_IR",
"SCG_MTH3_IL7_C", "ID_MTH5_IL7_C", "SCG_MTH8_IL7_C1", "SCG_MTH8_IL7_C2")
)
all.cao <- Cacoa$new(
all.con,
cell.groups = setNames(object = annotation_df$cell_type_l2, rownames(annotation_df)),
sample.groups = sample.groups,
ref.level = "Control",
target.level = "Aged",
embedding = all.con$embeddings$UMAP
)
all.cao$estimateCellDensity()
%%R -w 2500 -h 500 -r 100
p0 <- all.cao$plotEmbedding(
color.by = "cell.groups",
title = "Cell type",
show.legend = FALSE,
font.size = c(3, 4)
)
pl <- all.cao$plotCellDensity(
contours = c("Postn+ MC", "Cd248+ MC"),
add.points = TRUE,
show.grid = FALSE,
show.cell.groups = FALSE
)
plot_grid(p0, plotlist = pl, nrow = 1)
%%R -i stromal_annotation_df
stromal.con <- readRDS("/home/sisaev/projects/Gustafsson_et_al_2022/data/Mouse_SC_pagoda2_conos/conos.rds")
stromal.cao <- Cacoa$new(
stromal.con,
cell.groups = setNames(object = stromal_annotation_df$cell_type_l2, rownames(stromal_annotation_df)),
sample.groups = sample.groups,
ref.level = "Control",
target.level = "Aged",
embedding = stromal.con$embeddings$UMAP
)
stromal.cao$estimateCellDensity()
%%R -w 2500 -h 500 -r 100
p0 <- stromal.cao$plotEmbedding(
color.by = "cell.groups",
title = "Cell type",
show.legend = FALSE,
font.size = c(3, 4)
)
pl <- stromal.cao$plotCellDensity(
contours = c("Postn+ MC", "Cd248+ MC"),
add.points = TRUE,
show.grid = FALSE,
show.cell.groups = FALSE
)
plot_grid(p0, plotlist = pl, nrow = 1)
%%R -i MC_annotation_df
MC.con <- readRDS("/home/sisaev/projects/Gustafsson_et_al_2022/data/Mouse_MC_pagoda2_conos/conos.rds")
MC.cao <- Cacoa$new(
MC.con,
cell.groups = setNames(object = MC_annotation_df$cell_type_l2, rownames(MC_annotation_df)),
sample.groups = sample.groups,
ref.level = "Control",
target.level = "Aged",
embedding = MC.con$embeddings$UMAP
)
MC.cao$estimateCellDensity()
%%R -w 2500 -h 500 -r 100
p0 <- MC.cao$plotEmbedding(
color.by = "cell.groups",
title = "Cell type",
show.legend = FALSE,
font.size = c(3, 4)
)
pl <- MC.cao$plotCellDensity(
contours = c("Postn+ MC", "Cd248+ MC"),
add.points = TRUE,
show.grid = FALSE,
show.cell.groups = FALSE
)
plot_grid(p0, plotlist = pl, nrow = 1)
import progeny
model = progeny.load_model(organism="Mouse", top=1000)
progeny.run(
adata_MC,
model,
center=True,
num_perm=100,
norm=True,
scale=True,
use_raw=False,
min_size=5
)
9193 genes found
100%|██████████| 100/100 [00:19<00:00, 5.16it/s]
adata_MC_progeny = progeny.extract(adata_MC)
sc.pl.umap(adata_MC, color="cell_type_l2", title="Cell type", frameon=False)
scv.pl.umap(adata_MC_progeny, color=adata_MC_progeny.var.index, vmin=-4, vmax=4, cmap="coolwarm", ncols=3)
adata.write_h5ad(f"{PROJECT_DIR}/data/Mouse_AllCells_conos.h5ad")
adata_MC.write_h5ad(f"{PROJECT_DIR}/data/Mouse_MC_conos.h5ad")
adata_SC.write_h5ad(f"{PROJECT_DIR}/data/Mouse_SC_conos.h5ad")
adata_MC = sc.read_h5ad(f"{PROJECT_DIR}/data/Mouse_MC_conos.h5ad")
Postn_DE = get_DESeq_multigroup(adata_MC[adata_MC.obs.cell_type_l2 == "Postn+ MC"])
Penk_DE = get_DESeq_multigroup(adata_MC[adata_MC.obs.cell_type_l2 == "Penk+ Cdh11+ MC"])
R[write to console]: estimating size factors R[write to console]: estimating dispersions R[write to console]: gene-wise dispersion estimates R[write to console]: mean-dispersion relationship R[write to console]: final dispersion estimates R[write to console]: fitting model and testing R[write to console]: estimating size factors R[write to console]: estimating dispersions R[write to console]: gene-wise dispersion estimates R[write to console]: mean-dispersion relationship R[write to console]: final dispersion estimates R[write to console]: fitting model and testing
gene_list = ["Pla2g7", "Fst", "Ltb"]
target_df = {}
for gene in gene_list:
target_df[f"{gene} in Postn+ MC"] = []
if gene == "Fst":
target_df[f"{gene} in Penk+ Cdh11+ MC"] = []
for condition in Penk_DE["DE"]["control"].keys():
target_df[f"{gene} in Postn+ MC"].append(Postn_DE["DE"]["control"][condition][Postn_DE["DE"]["control"][condition].id == gene].pvalue[0])
if gene == "Fst":
target_df[f"{gene} in Penk+ Cdh11+ MC"].append(Penk_DE["DE"]["control"][condition][Penk_DE["DE"]["control"][condition].id == gene].pvalue[0])
target_df = pd.DataFrame(target_df, index=Penk_DE["DE"]["control"].keys())
target_df.index = "control vs. " + target_df.index
correct_all_pvalues(target_df, correction_method="fdr_bh")
| Pla2g7 in Postn+ MC | Fst in Postn+ MC | Fst in Penk+ Cdh11+ MC | Ltb in Postn+ MC | |
|---|---|---|---|---|
| control vs. aged | 2.370061e-07 | 1.225114e-04 | 1.084776e-07 | 0.000001 |
| control vs. IL7_ko | 1.039121e-05 | 1.006220e-06 | 7.804656e-01 | 0.009288 |
| control vs. irradiated | 9.403827e-07 | 9.766650e-16 | 2.335442e-09 | 0.004818 |
df_counts = Postn_DE["normalized counts"].loc[gene_list].T
df_counts.columns = df_counts.columns + " in Postn+ MC"
df_counts["Fst in Penk+ Cdh11+ MC"] = Penk_DE["normalized counts"].loc["Fst"]
df_counts["condition"] = [sample_type[sample] for sample in df_counts.index]
df_counts.sort_values(by="condition")
| id | Pla2g7 in Postn+ MC | Fst in Postn+ MC | Ltb in Postn+ MC | Fst in Penk+ Cdh11+ MC | condition |
|---|---|---|---|---|---|
| SCG_MTH7_IR | 2.510310 | 190.783545 | 0.836770 | 738.926142 | IR |
| SCG_MTH2_IR | 11.760537 | 143.086534 | 2.940134 | 651.435120 | IR |
| ID_MTH5_IR | 3.311865 | 277.534328 | 0.662373 | 798.950796 | IR |
| SCG_MTH3_IL7_C | 1.081882 | 25.965164 | 3.245645 | 222.752857 | Il7ko |
| SCG_MTH8_IL7_C2 | 8.069227 | 43.035880 | 0.000000 | 130.020282 | Il7ko |
| SCG_MTH8_IL7_C1 | 3.895647 | 77.912934 | 0.000000 | 90.250264 | Il7ko |
| ID_MTH5_IL7_C | 9.104354 | 109.252246 | 0.000000 | 280.378797 | Il7ko |
| ID_MTH9_Old_M1 | 76.367157 | 52.226779 | 0.464238 | 731.188820 | aged |
| ID_MTH9_Old_F3 | 76.338225 | 22.265316 | 0.000000 | 366.352298 | aged |
| ID_MTH9_Old_F4 | 66.656577 | 38.173370 | 0.293641 | 413.445806 | aged |
| ID_MTH9_Old_M2 | 60.624156 | 34.254007 | 0.000000 | 775.025348 | aged |
| SCG_MTH2_C | 31.275571 | 1.954723 | 17.592509 | 197.657716 | control |
| SCG_MTH3_C | 21.945732 | 1.995067 | 26.933399 | 110.395868 | control |
| SCG_65 | 18.504015 | 16.653613 | 5.551204 | 202.341148 | control |
| SCG_MTH4_C | 27.814065 | 16.150103 | 1.794456 | 200.268994 | control |
adata_SC = sc.read_h5ad(f"{PROJECT_DIR}/data/Mouse_SC_conos.h5ad")
adata_SC.obs["sample_cell_type_l2"] = adata_SC.obs.sample_id.astype(str) + " | " + adata_SC.obs.cell_type_l2.astype(str)
adata_SC.obs["sample_cell_type"] = adata_SC.obs.sample_id.astype(str) + " | " + adata_SC.obs.cell_type.astype(str)
MC_DE = get_DESeq_multigroup(adata_SC[adata_SC.obs.condition == "control"],
split_by="sample_cell_type",
grouping="cell_type")
R[write to console]: Note: levels of factors in the design contain characters other than letters, numbers, '_' and '.'. It is recommended (but not required) to use only letters, numbers, and delimiters '_' or '.', as these are safe characters for column names in R. [This is a message, not a warning or an error] R[write to console]: estimating size factors R[write to console]: Note: levels of factors in the design contain characters other than letters, numbers, '_' and '.'. It is recommended (but not required) to use only letters, numbers, and delimiters '_' or '.', as these are safe characters for column names in R. [This is a message, not a warning or an error] R[write to console]: estimating dispersions R[write to console]: gene-wise dispersion estimates R[write to console]: mean-dispersion relationship R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the function: y = a/x + b, and a local regression fit was automatically substituted. specify fitType='local' or 'mean' to avoid this message next time. R[write to console]: Note: levels of factors in the design contain characters other than letters, numbers, '_' and '.'. It is recommended (but not required) to use only letters, numbers, and delimiters '_' or '.', as these are safe characters for column names in R. [This is a message, not a warning or an error] R[write to console]: final dispersion estimates R[write to console]: Note: levels of factors in the design contain characters other than letters, numbers, '_' and '.'. It is recommended (but not required) to use only letters, numbers, and delimiters '_' or '.', as these are safe characters for column names in R. [This is a message, not a warning or an error] R[write to console]: fitting model and testing
target_df = {}
target_genes = ["Ccl19", "Flt3l", "Il15"]
for gene in target_genes:
target_df[gene] = []
for celltype in MC_DE["DE"]["MC"].keys():
target_df[gene].append(MC_DE["DE"]["MC"][celltype][MC_DE["DE"]["MC"][celltype].id == gene].pvalue[0])
target_df = pd.DataFrame(target_df, index=MC_DE["DE"]["MC"].keys())
target_df.index = "MC vs. " + target_df.index
#target_df
correct_all_pvalues(target_df, correction_method="fdr_bh")
| Ccl19 | Flt3l | Il15 | |
|---|---|---|---|
| MC vs. NC | 0.028205 | 0.616840 | 0.810263 |
| MC vs. aPV | 0.255979 | 0.261241 | 0.402547 |
| MC vs. TEC A | 0.653715 | 0.003588 | 0.483246 |
| MC vs. pcPV | 0.000105 | 0.004981 | 0.254943 |
| MC vs. Lrrn4+ | 0.003723 | 0.276793 | 0.255979 |
| MC vs. TEC B | 0.402547 | 0.255749 | 0.606837 |
| MC vs. EC | 0.000029 | 0.035626 | 0.483246 |
| MC vs. Tuft cells | 0.000029 | 0.035626 | 0.052373 |
df_counts = MC_DE["normalized counts"].loc[target_genes].T
df_counts["sample_id"] = [index.split(" | ")[0] for index in df_counts.index]
df_counts["cell_type"] = [index.split(" | ")[1] for index in df_counts.index]
df_counts.index = range(len(df_counts))
#df_counts.sort_values(by="cell_type")
adata_MC = sc.read_h5ad(f"{PROJECT_DIR}/data/Mouse_MC_conos.h5ad")
adata_MC.obs["sample_cell_type_l2"] = adata_MC.obs.sample_id.astype(str) + " | " + adata_MC.obs.cell_type_l2.astype(str)
only_MC_DE = get_DESeq_multigroup(
adata_MC[adata_MC.obs.condition == "control"],
split_by="sample_cell_type_l2",
grouping="cell_type_l2"
)
R[write to console]: Note: levels of factors in the design contain characters other than letters, numbers, '_' and '.'. It is recommended (but not required) to use only letters, numbers, and delimiters '_' or '.', as these are safe characters for column names in R. [This is a message, not a warning or an error] R[write to console]: estimating size factors R[write to console]: Note: levels of factors in the design contain characters other than letters, numbers, '_' and '.'. It is recommended (but not required) to use only letters, numbers, and delimiters '_' or '.', as these are safe characters for column names in R. [This is a message, not a warning or an error] R[write to console]: estimating dispersions R[write to console]: gene-wise dispersion estimates R[write to console]: mean-dispersion relationship R[write to console]: Note: levels of factors in the design contain characters other than letters, numbers, '_' and '.'. It is recommended (but not required) to use only letters, numbers, and delimiters '_' or '.', as these are safe characters for column names in R. [This is a message, not a warning or an error] R[write to console]: final dispersion estimates R[write to console]: Note: levels of factors in the design contain characters other than letters, numbers, '_' and '.'. It is recommended (but not required) to use only letters, numbers, and delimiters '_' or '.', as these are safe characters for column names in R. [This is a message, not a warning or an error] R[write to console]: fitting model and testing
target_df = {}
target_genes = ["Ccl19", "Flt3l", "Il15"]
for gene in target_genes:
target_df[gene] = []
for celltype in only_MC_DE["DE"]["Postn+ MC"].keys():
target_df[gene].append(only_MC_DE["DE"]["Postn+ MC"][celltype][only_MC_DE["DE"]["Postn+ MC"][celltype].id == gene].pvalue[0])
target_df = pd.DataFrame(target_df, index=only_MC_DE["DE"]["Postn+ MC"].keys())
target_df.index = "Postn+ MC vs. " + target_df.index
#target_df
correct_all_pvalues(target_df, correction_method="fdr_bh")
| Ccl19 | Flt3l | Il15 | |
|---|---|---|---|
| Postn+ MC vs. Penk+ Cdh11+ MC | 5.371824e-02 | 2.841298e-04 | 6.256898e-05 |
| Postn+ MC vs. Cd248+ MC | 1.895364e-18 | 1.076371e-11 | 2.468738e-09 |
df_counts = only_MC_DE["normalized counts"].loc[target_genes].T
df_counts["sample_id"] = [index.split(" | ")[0] for index in df_counts.index]
df_counts["cell_type"] = [index.split(" | ")[1] for index in df_counts.index]
df_counts.index = range(len(df_counts))
df_counts.sort_values(by="cell_type")
| id | Ccl19 | Flt3l | Il15 | sample_id | cell_type |
|---|---|---|---|---|---|
| 1 | 10.814181 | 16.993714 | 6.522840 | SCG_MTH4_C | Cd248+ MC |
| 5 | 6.235403 | 18.706210 | 9.353105 | SCG_MTH2_C | Cd248+ MC |
| 6 | 0.967251 | 9.672506 | 5.803504 | SCG_MTH3_C | Cd248+ MC |
| 11 | 4.544374 | 19.086370 | 5.453248 | SCG_65 | Cd248+ MC |
| 2 | 112.513586 | 27.063402 | 11.527004 | SCG_MTH4_C | Penk+ Cdh11+ MC |
| 4 | 100.687029 | 26.315928 | 9.153366 | SCG_MTH3_C | Penk+ Cdh11+ MC |
| 9 | 387.132805 | 54.293015 | 14.163395 | SCG_MTH2_C | Penk+ Cdh11+ MC |
| 10 | 106.734335 | 41.993837 | 13.997946 | SCG_65 | Penk+ Cdh11+ MC |
| 0 | 245.393376 | 73.691705 | 49.373442 | SCG_MTH4_C | Postn+ MC |
| 3 | 319.501086 | 58.897928 | 28.238732 | SCG_MTH3_C | Postn+ MC |
| 7 | 427.721365 | 111.000354 | 20.720066 | SCG_65 | Postn+ MC |
| 8 | 597.340550 | 71.300596 | 42.780358 | SCG_MTH2_C | Postn+ MC |
df = {}
ct_1 = "Postn+ MC"; ct_2 = "Penk+ Cdh11+ MC"; gene = "Postn"
df[f"{ct_1} vs. {ct_2} ({gene})"] = only_MC_DE["DE"][ct_1][ct_2][only_MC_DE["DE"][ct_1][ct_2].id == gene].padj[0]
ct_1 = "Postn+ MC"; ct_2 = "Cd248+ MC"; gene = "Postn"
df[f"{ct_1} vs. {ct_2} ({gene})"] = only_MC_DE["DE"][ct_1][ct_2][only_MC_DE["DE"][ct_1][ct_2].id == gene].padj[0]
ct_1 = "Penk+ Cdh11+ MC"; ct_2 = "Postn+ MC"; gene = "Penk"
df[f"{ct_1} vs. {ct_2} ({gene})"] = only_MC_DE["DE"][ct_1][ct_2][only_MC_DE["DE"][ct_1][ct_2].id == gene].padj[0]
ct_1 = "Penk+ Cdh11+ MC"; ct_2 = "Cd248+ MC"; gene = "Penk"
df[f"{ct_1} vs. {ct_2} ({gene})"] = only_MC_DE["DE"][ct_1][ct_2][only_MC_DE["DE"][ct_1][ct_2].id == gene].padj[0]
ct_1 = "Cd248+ MC"; ct_2 = "Postn+ MC"; gene = "Cd248"
df[f"{ct_1} vs. {ct_2} ({gene})"] = only_MC_DE["DE"][ct_1][ct_2][only_MC_DE["DE"][ct_1][ct_2].id == gene].padj[0]
ct_1 = "Cd248+ MC"; ct_2 = "Penk+ Cdh11+ MC"; gene = "Cd248"
df[f"{ct_1} vs. {ct_2} ({gene})"] = only_MC_DE["DE"][ct_1][ct_2][only_MC_DE["DE"][ct_1][ct_2].id == gene].padj[0]
df = pd.DataFrame(df, index=["p_adj"]).T
df
| p_adj | |
|---|---|
| Postn+ MC vs. Penk+ Cdh11+ MC (Postn) | 2.697548e-05 |
| Postn+ MC vs. Cd248+ MC (Postn) | 1.912401e-11 |
| Penk+ Cdh11+ MC vs. Postn+ MC (Penk) | 1.918110e-02 |
| Penk+ Cdh11+ MC vs. Cd248+ MC (Penk) | 3.306198e-04 |
| Cd248+ MC vs. Postn+ MC (Cd248) | 7.932441e-36 |
| Cd248+ MC vs. Penk+ Cdh11+ MC (Cd248) | 1.853144e-17 |
df_counts = only_MC_DE["normalized counts"].loc[["Postn", "Penk", "Cd248"]].T
df_counts["sample_id"] = [index.split(" | ")[0] for index in df_counts.index]
df_counts["cell_type"] = [index.split(" | ")[1] for index in df_counts.index]
df_counts.index = range(len(df_counts))
df_counts.sort_values(by="cell_type")
| id | Postn | Penk | Cd248 | sample_id | cell_type |
|---|---|---|---|---|---|
| 1 | 12.187411 | 76.900846 | 369.398705 | SCG_MTH4_C | Cd248+ MC |
| 5 | 0.000000 | 6.235403 | 318.005568 | SCG_MTH2_C | Cd248+ MC |
| 6 | 18.377762 | 45.460780 | 336.603224 | SCG_MTH3_C | Cd248+ MC |
| 11 | 5.453248 | 33.628366 | 400.813762 | SCG_65 | Cd248+ MC |
| 2 | 13.531701 | 441.033212 | 87.956055 | SCG_MTH4_C | Penk+ Cdh11+ MC |
| 4 | 20.595074 | 186.499838 | 50.343515 | SCG_MTH3_C | Penk+ Cdh11+ MC |
| 9 | 73.177542 | 297.431301 | 59.014147 | SCG_MTH2_C | Penk+ Cdh11+ MC |
| 10 | 19.247175 | 313.204031 | 59.491269 | SCG_65 | Penk+ Cdh11+ MC |
| 0 | 162.121750 | 141.488073 | 28.739765 | SCG_MTH4_C | Postn+ MC |
| 3 | 476.024346 | 26.625091 | 21.784165 | SCG_MTH3_C | Postn+ MC |
| 7 | 276.760883 | 39.960127 | 47.360151 | SCG_65 | Postn+ MC |
| 8 | 533.962242 | 58.624935 | 11.091204 | SCG_MTH2_C | Postn+ MC |
GO_BP_mouse = pd.read_csv(f"{PROJECT_DIR}/data/GSEA/gene_sets/MSigDB_mouse_GO-BP.tsv", sep = "\t")
GO_CC_mouse = pd.read_csv(f"{PROJECT_DIR}/data/GSEA/gene_sets/MSigDB_mouse_GO-CC.tsv", sep = "\t")
GO_MF_mouse = pd.read_csv(f"{PROJECT_DIR}/data/GSEA/gene_sets/MSigDB_mouse_GO-MF.tsv", sep = "\t")
GO_mouse = pd.concat([GO_BP_mouse, GO_CC_mouse, GO_MF_mouse])
gene_sets_GO = {}
for gs in tqdm(set(GO_mouse.gs_name)):
gene_sets_GO[gs] = list(GO_mouse[GO_mouse.gs_name == gs].gene_symbol)
adata_mouse_MC = sc.read_h5ad(f"{PROJECT_DIR}/data/Mouse_MC_conos.h5ad")
adata_mouse_MC = adata_mouse_MC[adata_mouse_MC.obs.condition == "control"]
Postn_DESeq = get_DESeq(adata_mouse_MC, group="Postn+ MC").dropna().sort_values(by="log2FoldChange", ascending=False)
Cdh11_DESeq = get_DESeq(adata_mouse_MC, group="Penk+ Cdh11+ MC").dropna().sort_values(by="log2FoldChange", ascending=False)
Cd248_DESeq = get_DESeq(adata_mouse_MC, group="Cd248+ MC").dropna().sort_values(by="log2FoldChange", ascending=False)
Cd248_GSEA = get_GSEA(Cd248_DESeq, gene_sets_GO.copy(), outdir=f"{PROJECT_DIR}/data/GSEA/Cd248/")
Cd248_GSEA.to_csv(f"{PROJECT_DIR}/data/GSEA/Cd248.tsv", sep="\t")
Cdh11_GSEA = get_GSEA(Cdh11_DESeq, gene_sets_GO.copy(), outdir=f"{PROJECT_DIR}/data/GSEA/Cdh11/")
Cdh11_GSEA.to_csv(f"{PROJECT_DIR}/data/GSEA/Cdh11.tsv", sep="\t")
Postn_GSEA = get_GSEA(Postn_DESeq, gene_sets_GO.copy(), outdir=f"{PROJECT_DIR}/data/GSEA/Postn/")
Postn_GSEA.to_csv(f"{PROJECT_DIR}/data/GSEA/Postn.tsv", sep="\t")
adata_Cd248 = adata_mouse_MC[adata_mouse_MC.obs["cell_type_l2"] == "Cd248+ MC"]
adata_Cdh11 = adata_mouse_MC[adata_mouse_MC.obs["cell_type_l2"] == "Penk+ Cdh11+ MC"]
adata_Postn = adata_mouse_MC[adata_mouse_MC.obs["cell_type_l2"] == "Postn+ MC"]
Postn_DESeq_IR = get_DESeq(adata_Postn, label="condition", group="irradiated", reference="control").dropna().sort_values(by="log2FoldChange", ascending=False)
Cdh11_DESeq_IR = get_DESeq(adata_Cdh11, label="condition", group="irradiated", reference="control").dropna().sort_values(by="log2FoldChange", ascending=False)
Cd248_DESeq_IR = get_DESeq(adata_Cd248, label="condition", group="irradiated", reference="control").dropna().sort_values(by="log2FoldChange", ascending=False)
Postn_DESeq_IR.to_csv(f"{PROJECT_DIR}/data/GSEA/DEGs_IR/Postn_IR_DEGs.tsv", sep="\t")
Cdh11_DESeq_IR.to_csv(f"{PROJECT_DIR}/data/GSEA/DEGs_IR/Cdh11_IR_DEGs.tsv", sep="\t")
Cd248_DESeq_IR.to_csv(f"{PROJECT_DIR}/data/GSEA/DEGs_IR/Cd248_IR_DEGs.tsv", sep="\t")
Cd248_GSEA_IR = get_GSEA(Cd248_DESeq_IR, gene_sets_GO, outdir=f"{PROJECT_DIR}/data/GSEA/Cd248/")
Cd248_GSEA_IR.to_csv(f"{PROJECT_DIR}/data/GSEA/Cd248_IR.tsv", sep="\t")
Cdh11_GSEA_IR = get_GSEA(Cdh11_DESeq_IR, gene_sets_GO, outdir=f"{PROJECT_DIR}/data/GSEA/Cdh11/")
Cdh11_GSEA_IR.to_csv(f"{PROJECT_DIR}/data/GSEA/Cdh11_IR.tsv", sep="\t")
Postn_GSEA_IR = get_GSEA(Postn_DESeq_IR, gene_sets_GO, outdir=f"{PROJECT_DIR}/data/GSEA/Postn/")
Postn_GSEA_IR.to_csv(f"{PROJECT_DIR}/data/GSEA/Postn_IR.tsv", sep="\t")
Postn_DESeq_aged = get_DESeq(adata_Postn, label="condition", group="aged", reference="control").dropna().sort_values(by="log2FoldChange", ascending=False)
Cdh11_DESeq_aged = get_DESeq(adata_Cdh11, label="condition", group="aged", reference="control").dropna().sort_values(by="log2FoldChange", ascending=False)
Cd248_DESeq_aged = get_DESeq(adata_Cd248, label="condition", group="aged", reference="control").dropna().sort_values(by="log2FoldChange", ascending=False)
Cd248_GSEA_aged = get_GSEA(Cd248_DESeq_aged, gene_sets_GO.copy(), outdir=f"{PROJECT_DIR}/data/GSEA/Cd248/")
Cd248_GSEA_aged.to_csv(f"{PROJECT_DIR}/data/GSEA/Cd248_aged.tsv", sep="\t")
Cdh11_GSEA_aged = get_GSEA(Cdh11_DESeq_aged, gene_sets_GO.copy(), outdir=f"{PROJECT_DIR}/data/GSEA/Cdh11/")
Cdh11_GSEA_aged.to_csv(f"{PROJECT_DIR}/data/GSEA/Cdh11_aged.tsv", sep="\t")
Postn_GSEA_aged = get_GSEA(Postn_DESeq_aged, gene_sets_GO.copy(), outdir=f"{PROJECT_DIR}/data/GSEA/Postn/")
Postn_GSEA_aged.to_csv(f"{PROJECT_DIR}/data/GSEA/Postn_aged.tsv", sep="\t")